In [12]:
import pandas as pd
import numpy as np

In [13]:
# Relative paths of the two input CSV files; both are loaded with
# pd.read_csv in the next cell.
# NOTE(review): judging by the file names, the first presumably holds solver
# timing results and the second per-matrix features -- confirm against the data.
timing_file = '../data/all_results_janus_single_node_1-14-17.csv'
properties_file = '../data/uflorida-features.csv'

In [14]:
# Load both CSV files; the first row of each file is used as the header row.
timings = pd.read_csv(filepath_or_buffer=timing_file, header=0)
properties = pd.read_csv(filepath_or_buffer=properties_file, header=0)

Rename the columns to simpler names for ease of use.


In [15]:
# Short column labels for the timing results (assigned positionally).
timings.columns = [
    'np', 'matrix', 'solver', 'prec',
    'status', 'time', 'iters', 'resid',
]

# Short column labels for the matrix properties (assigned positionally).
# NOTE(review): 'min_nnz_row' appears twice in this list (3rd and 18th label),
# so the resulting frame has duplicate column labels -- confirm which source
# column each one corresponds to before selecting it by name.
properties.columns = [
    'rows', 'cols', 'min_nnz_row',
    'row_var', 'col_var', 'diag_var',
    'nnz',
    'frob_norm', 'symm_frob_norm', 'antisymm_frob_norm',
    'one_norm',
    'inf_norm', 'symm_inf_norm', 'antisymm_inf_norm',
    'max_nnz_row',
    'trace', 'abs_trace',
    'min_nnz_row', 'avg_nnz_row',
    'dummy_rows', 'dummy_rows_kind',
    'num_value_symm_1', 'nnz_pattern_symm_1',
    'num_value_symm_2', 'nnz_pattern_symm_2',
    'row_diag_dom', 'col_diag_dom',
    'diag_avg', 'diag_sign', 'diag_nnz',
    'lower_bw', 'upper_bw',
    'row_log_val_spread', 'col_log_val_spread',
    'symm', 'matrix',
]

Combine the two dataframes into a single dataframe called `combined`.

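The string-valued columns (`solver`, `prec`, `status`) are encoded as numerical codes in new `*_num` columns further below, after rows with missing values have been dropped.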


In [16]:
# Join every timing record with the properties of the matrix it was run on.
# The join key is stated explicitly: without `on=`, pd.merge joins on every
# column name the two frames happen to share, so a future rename could
# silently change the join. 'matrix' is the only label present in both the
# `properties` and `timings` column lists, so the result is unchanged.
combined = pd.merge(properties, timings, on='matrix', how='inner')
combined.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 275286 entries, 0 to 275285
Data columns (total 43 columns):
rows                  275286 non-null int64
cols                  275286 non-null int64
min_nnz_row           275286 non-null int64
row_var               275286 non-null float64
col_var               275286 non-null float64
diag_var              275286 non-null float64
nnz                   275286 non-null int64
frob_norm             275286 non-null float64
symm_frob_norm        275286 non-null float64
antisymm_frob_norm    275286 non-null float64
one_norm              275286 non-null float64
inf_norm              275286 non-null float64
symm_inf_norm         275286 non-null float64
antisymm_inf_norm     275286 non-null float64
max_nnz_row           275286 non-null int64
trace                 275286 non-null float64
abs_trace             275286 non-null float64
min_nnz_row           275286 non-null int64
avg_nnz_row           275286 non-null int64
dummy_rows            275286 non-null int64
dummy_rows_kind       275286 non-null int64
num_value_symm_1      275286 non-null int64
nnz_pattern_symm_1    275286 non-null int64
num_value_symm_2      275286 non-null float64
nnz_pattern_symm_2    275286 non-null float64
row_diag_dom          275286 non-null int64
col_diag_dom          275286 non-null int64
diag_avg              275286 non-null float64
diag_sign             275286 non-null int64
diag_nnz              275286 non-null int64
lower_bw              275286 non-null int64
upper_bw              275286 non-null int64
row_log_val_spread    275286 non-null float64
col_log_val_spread    275286 non-null float64
symm                  275286 non-null int64
matrix                275286 non-null object
np                    275286 non-null int64
solver                275286 non-null object
prec                  275286 non-null object
status                275286 non-null object
time                  275286 non-null float64
iters                 156188 non-null float64
resid                 119113 non-null float64
dtypes: float64(20), int64(19), object(4)
memory usage: 92.4+ MB

In [17]:
# Discard every row that has a missing value in any column (in the info()
# output above, only `iters` and `resid` contain nulls).
combined = combined.dropna(axis=0, how='any')

In [18]:
# Encode the string-valued columns as integer codes in new `*_num` columns.
# A value missing from a mapping becomes NaN, which makes astype(int) raise.
combined['solver_num'] = combined['solver'].map({
    'FIXED_POINT': 0,
    'BICGSTAB': 1,
    'MINRES': 2,
    'PSEUDOBLOCK_CG': 3,
    'PSEUDOBLOCK_STOCHASTIC_CG': 4,
    'PSEUDOBLOCK_TFQMR': 5,
    'TFQMR': 6,
    'LSQR': 7,
    'PSEUDOBLOCK_GMRES': 8,
}).astype(int)

combined['prec_num'] = combined['prec'].map({
    'ILUT': 0,
    'RILUK': 1,
    'RELAXATION': 2,
    'CHEBYSHEV': 3,
    'NONE': 4,
}).astype(int)

combined['status_num'] = combined['status'].map({
    'error': -1,
    'unconverged': 0,
    'converged': 1,
}).astype(int)

None of the cells above should be changed; the analysis below depends on them.


In [19]:
# Keep only the runs whose status is 'converged'.
good = combined.loc[combined['status'] == 'converged']

In [20]:
# Number of converged runs per solver.
good.groupby(by='solver').size()


Out[20]:
solver
BICGSTAB             6935
FIXED_POINT          2147
MINRES               5241
PSEUDOBLOCK_CG       3593
PSEUDOBLOCK_GMRES    6951
PSEUDOBLOCK_TFQMR    5827
TFQMR                5707
dtype: int64

So let's see how big a difference there is between TFQMR and PSEUDOBLOCK_TFQMR.


In [76]:
# Converged runs of the two TFQMR variants, restricted to the timing columns.
values = {"TFQMR", "PSEUDOBLOCK_TFQMR"}

# Select the wanted columns by name rather than dropping positional slices
# (`columns[:36]` / `columns[-3:]`): the positional form silently selects the
# wrong columns as soon as a column is added to or removed from `combined`.
# These seven labels are exactly what the positional drops left behind.
timing_columns = ['np', 'solver', 'prec', 'status', 'time', 'iters', 'resid']
tfqmr = good.loc[good['solver'].isin(values), timing_columns]
tfqmr.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11534 entries, 25 to 275273
Data columns (total 7 columns):
np        11534 non-null int64
solver    11534 non-null object
prec      11534 non-null object
status    11534 non-null object
time      11534 non-null float64
iters     11534 non-null float64
resid     11534 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 720.9+ KB

Summary statistics showing how much the two solvers differ.


In [77]:
# Per-solver summary statistics of the numeric columns.
# The GroupBy object gets its own name so that `tfqmr` stays a DataFrame:
# rebinding `tfqmr` to the GroupBy made this cell fail when run a second
# time, because a GroupBy object has no `.groupby` method.
tfqmr_by_solver = tfqmr.groupby('solver')
tfqmr_by_solver.describe()


Out[77]:
iters np resid time
solver
PSEUDOBLOCK_TFQMR count 5827.000000 5827.000000 5.827000e+03 5827.000000
mean 669.441737 5.921229 5.432610e-07 1.237965
std 1574.980811 3.809055 3.468492e-07 7.532847
min 1.000000 1.000000 4.560000e-33 0.000746
25% 13.000000 2.000000 2.220000e-07 0.018921
50% 91.000000 6.000000 6.000000e-07 0.085300
75% 382.000000 10.000000 8.735000e-07 0.426127
max 9959.000000 12.000000 1.000000e-06 271.173000
TFQMR count 5707.000000 5707.000000 5.707000e+03 5707.000000
mean 669.346066 5.915017 5.414734e-07 1.134524
std 1583.168929 3.828605 3.471924e-07 7.684904
min 1.000000 1.000000 4.560000e-33 0.000563
25% 13.000000 2.000000 2.190000e-07 0.013752
50% 92.000000 6.000000 5.980000e-07 0.062503
75% 387.000000 10.000000 8.720000e-07 0.306544
max 9959.000000 12.000000 1.000000e-06 288.867000

In [ ]: